% updated 07/07/2017
% Services Division
% model 1: Y = lam*A*Y + rho*Y_ttl + X*b + error
%% CONTROL PANEL
clear
parentpath = cd(cd('..'));

warning('ON','msg:id')
dataset = 'cati_merged_sdc_rnd'; % Choose from: {'cati','sdc_rnd','sdc_rnd_ctt','cati_merged_sdc_rnd','cati_merged_sdc_rnd_ctt'}

rndlink = 0; % 1 if link duration is random, 0 if link duration is fixed
lnkyear = 5; % Choose from: {3,4,5,6,7}.
taxcred = 0; % 1 if use tax credit as IV for RND stock
jaffepx = 1; % 1 if use jaffe proximity, 0 if use mahalanobis proximity
    
timelag = 1; % time lag of RND stock
sicdgit = 1; % Choose from: {1,10,100,1000}

fac = 1e3; % scale factor for output

yr0 = 1962;
yr1 = 1966;
yr2 = 2006;

% drop the first t0 years (i.e. the starting year is (yr1+t0))
t0 = 1;
%%
switch dataset
    case 'cati'
        n = 11093;
    case 'sdc_rnd'
        n = 12253;
    case 'sdc_rnd_ctt'
        n = 21387;
    case 'cati_merged_sdc_rnd'
        n = 19654;
    case 'cati_merged_sdc_rnd_ctt'
        n = 27147;
    otherwise
        disp('No appropriate dataset specified.')
end
T = yr2-yr1+1;
%%
ID = zeros(n,2);
ID(:,1) = (1:n)';
for s = 1:T
    yr = yr1+s-1;
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID SIC SALES PROFIT RND EMPL FIXED_CAPITAL TOT_COST.csv'];
    flid = fopen(file);
    dat0 = textscan(flid,'%s %s %s %s %s %s %s %s','delimiter',';','headerlines',1);
    fclose(flid);
    %---------------------------------
    FID = str2double(strtrim(dat0{1})); % FID
    SIC = str2double(strtrim(dat0{2})); % SIC
    FID(isnan(FID)) = 0;
    SIC(isnan(SIC)) = 0;
    SIC(floor(SIC/1000)==0) = 0; % drop firms with less than 4-digit SIC
    
    for i = 1:n
        if SIC(FID==i) > 0
            if SIC(FID==i) ~= ID(i,2)
                if ID(i,2) == 0
                    ID(i,2) = SIC(FID==i);
                else
                    ID(i,2) = -1;
                    %warning('msg:id','SIC code changes over time')
                end
            end
        end
    end
end
ID(:,2) = floor(ID(:,2)/sicdgit);
ID = ID(ID(:,2)>0,:); % drop firms with missing SIC
n = size(ID,1);

file = [parentpath '/' dataset '/location/' 'ID NATION_ID NATION CITY LOC_X LOC_Y.csv'];
flid = fopen(file);
dat0 = textscan(flid,'%s %s %s %s %s %s','delimiter',';','headerlines',1);
fclose(flid);
tmp1 = str2double(strtrim(dat0{1}));
tmp2 = str2double(strtrim(dat0{2}));

loc = zeros(size(ID,1),1);
for i = 1:n
    if sum(tmp1==ID(i,1)) == 1
        loc(i) = tmp2(tmp1==ID(i,1));
    end
end
ID = ID(loc==235,:);
n = size(ID,1);
%%
dm = cell(T,1);
X1 = cell(T,1);
X2 = cell(T,1);
X3 = cell(T,1);
for s = 1:T
    yr = yr1+s-1;
    %---------------------------------
    firm = zeros(n,3);
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID OUTPUT.csv'];
    flid = fopen(file);
    dat1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid1 = str2double(strtrim(dat1{1}));
    outp = str2double(strtrim(dat1{2}));
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr-timelag) '_ID RND_STOCK.csv'];
    flid = fopen(file);
    dat2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid2 = str2double(strtrim(dat2{1}));
    stck = str2double(strtrim(dat2{2}));
    %---------------------------------
    file = [parentpath '/' dataset '/tax/bloom_firm_id_year_tax.csv'];
    flid = fopen(file);
    dat3 = textscan(flid,'%s %s %s','delimiter',',','headerlines',1);
    fclose(flid);
    tmp3 = str2double(strtrim(dat3{2}));
    dat3 = [str2double(strtrim(dat3{1})),str2double(strtrim(dat3{3}))];
    dat3 = dat3(tmp3==(yr-timelag),:);
    fid3 = dat3(:,1);
    taxc = dat3(:,2);
    %---------------------------------
    for i = 1:n
        if (sum(fid1==ID(i,1))==1)&&(sum(fid2==ID(i,1))==1)
            firm(i,1) = outp(fid1==ID(i,1))/fac;
            firm(i,2) = stck(fid2==ID(i,1));
            
            if (sum(fid3==ID(i,1))==1)
                firm(i,3) = taxc(fid3==ID(i,1));
            else
                firm(i,3) = 0;
            end
        end
    end
    firm(isnan(firm)) = 0;
    %---------------------------------
    file = [parentpath '/' dataset '/tot_output_compustat/' int2str(yr) '_Compustat_US_deflated_TOT_OUTPUT.csv'];
    flid  = fopen(file);
    dta1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec1 = [str2double(strtrim(dta1{1})),str2double(strtrim(dta1{2}))/fac];
    sec1(isnan(sec1)) = 0;
    sec1(:,1) = floor(sec1(:,1)/sicdgit);
    %---------------------------------
    file = [parentpath '/' dataset '/tot_rnd_stock/' int2str(yr-timelag) '_Compustat_US_TOT_RND_STOCK.csv'];
    flid  = fopen(file);
    dta2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec2 = [str2double(strtrim(dta2{1})),str2double(strtrim(dta2{2}))];
    sec2(isnan(sec2)) = 0;
    sec2(:,1) = floor(sec2(:,1)/sicdgit);
    %---------------------------------
    file = [parentpath '/' dataset '/tax/' int2str(yr-timelag) '_Bloom_US_TOT_TAX.csv'];
    flid  = fopen(file);
    dta3 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec3 = [str2double(strtrim(dta3{1})),str2double(strtrim(dta3{2}))];
    sec3(isnan(sec3)) = 0;
    sec3(:,1) = floor(sec3(:,1)/sicdgit);
    %---------------------------------
    % add the sector-level data to the firm-level data
    aggr = zeros(n,3);
    for i = 1:n
        aggr(i,1) = sum(sec1(sec1(:,1)==ID(i,2),2))-firm(i,1);
        aggr(i,2) = sum(sec2(sec2(:,1)==ID(i,2),2))-firm(i,2);
        aggr(i,3) = sum(sec3(sec3(:,1)==ID(i,2),2))-firm(i,3);
    end
    %---------------------------------
    % load RND adjacency matrix
    if rndlink == 1
        file = [parentpath '/' dataset '/adjacency_matrix_rand_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    else
        file = [parentpath '/' dataset '/adjacency_matrix_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    end
    load(file)
    A = A(ID(:,1),:);
    A = A(:,ID(:,1));
    %---------------------------------
    % indicate firms with missing observations
    dm{s} = ones(n,1);
    div = floor(ID(:,2)/1000);
    for i = 1:n
        if (min(firm(i,1:2))<=0)||(min(aggr(i,1:3))<0)
            dm{s}(i) = 0;
        end
        if (div(i)<7)||(div(i)>8) % only services firms
            dm{s}(i) = 0;
        end
    end
    %---------------------------------
    % define variables
    X1{s} = firm;
    X2{s} = A*firm;
    X3{s} = aggr;
end
%%
% drop the first t0 year(s)
dt = cat(1,dm{:});
dt = dt(n*t0+1:end);
T1 = T-t0;
%---------------------------------
% drop firms that appear less than twice
di = (1:n*T1)';
dj = kron(ones(T1,1),(1:n)');
Dn = sparse(di,dj,dt);
dd = sum(Dn,1);
Dn = Dn(:,dd>1); % firms appearing more than once in the panel
np = size(Dn,2); % that number of firms

En = speye(n);
En = En(dd>1,:); % firms appearing more than once in the panel
ID = En*ID;
for s = 1:T
    dm{s} = En*dm{s};
    X1{s} = En*X1{s};
    X2{s} = En*X2{s};
    X3{s} = En*X3{s};
end
%%
D1 = cell(T1,1);
D2 = cell(T1,1);
Yn = cell(T1,1);
Zn = cell(T1,1);
Qn = cell(T1,1);
pid = cell(T1,1);
tid = cell(T1,1);
for s = (t0+1):T
    Da = diag(dm{s});
    Da = Da(sum(Da,2)==1,:);
    Db = zeros(size(Da,1),T1);
    Db(:,s-t0) = 1;
    D1{s-t0} = Da;
    D2{s-t0} = Db;    
    
    Yn{s-t0} = Da*X1{s}(:,1);
    Zn{s-t0} = Da*[X2{s}(:,1),X3{s}(:,1),X1{s}(:,2)];
    if taxcred == 1
        Qn{s-t0} = Da*[X2{s}(:,3),X3{s}(:,3),X1{s}(:,3)];
    else
        Qn{s-t0} = Da*[X2{s}(:,2),X3{s}(:,2),X1{s}(:,2)];
    end
    pid{s-t0} = Da*ID(:,1);
    tid{s-t0} = (s-t0)*ones(size(Da,1),1);
end

D1 = cat(1,D1{:});
D2 = cat(1,D2{:});
Yn = cat(1,Yn{:});
Zn = cat(1,Zn{:});
Qn = cat(1,Qn{:});

temp = [cat(1,pid{:}),cat(1,tid{:}),Yn,Zn,Qn];
name = cell(1,size(temp,2));
name{1} = 'pid';
name{2} = 'tid';
name{3} = 'y';
for j = 1:size(Zn,2)
    name{j+3} = ['z' num2str(j)];
end
for j = 1:size(Qn,2)
    name{j+3+size(Zn,2)} = ['q' num2str(j)];
end

filename = ['stata_data1\data1_s_' num2str(rndlink) '_' num2str(lnkyear) '_' num2str(taxcred) '.csv'];
delete(filename);
csvwrite_with_headers(filename,temp,name);